We need to import the intersect data from our data. Here is what our data looks like:
scaffold qstart qend query type seqname seqstart seqend attribute
---
1 39310 39334 uce-127282_p11 intron 1 39281 39334
1 39334 39419 uce-127282_p11 exon 1 39335 39463 Parent=transcript:ENSMPTT00005003008;constitutive=0;exon_id=ENSMPTE00005013128;rank=8;version=1
...
1 108104 108225 uce-146693_p5 intergenic 1 72314 146673
...
(how intersect was created can be found here: https://github.com/crcardenas/Adephaga_UCE/blob/main/workflow.md)
library(tidyr) # data clean up
library(dplyr) # data cleanup
library(readr) # for importing
library(ggplot2) # for plotting
library(scales) # additional package for plotting
library(ggtext) # additional package for plotting
#library(psych) # for statistics, if necessary
# library(GenomicFeatures) # my not actually need this since we are doing our own thing
We have two files because of different GFF attribute fields for genes and exons. These can be joined later for manipulation, but may not need to be. There is no header information so we will need to add the header info I described above
d.intro_exon <- read_tsv(file="../Adephaga2.9-pterMadi2.introns-exons.out.intersect", col_names = F, na = c("","NA"))
## Rows: 14952 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): X1, X4, X5, X6, X9
## dbl (4): X2, X3, X7, X8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d.inter_gene <- read_tsv(file="../Adephaga2.9-pterMadi2.intergenic-genentic.out.intersect", col_names = F, na = c("","NA"))
## Rows: 15956 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): X1, X4, X5, X6, X9
## dbl (4): X2, X3, X7, X8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(d.intro_exon) <- c("scaffold","qstart","qend","query","type","seqname","seqstart","seqend","attribute")
colnames(d.inter_gene) <- c("scaffold","qstart","qend","query","type","seqname","seqstart","seqend","attribute")
d.intro_exon
d.inter_gene
df.intro_exon <- d.intro_exon %>% separate_wider_delim(cols = query,
delim = "_",
names = c("uce","probe")) %>%
separate_wider_delim(cols = attribute,
delim = ";",
names = c("parent","constitutive","ID","rank","version")) %>%
separate_wider_delim(cols = parent,
delim = ":",
names = c("parent","transcript")) %>%
separate_wider_delim(cols = ID,
delim = "=",
names = c("gff_attribute","exon_id")) %>%
mutate(query=paste(uce,probe, sep="_")) %>%
select(scaffold, qstart, qend, query, uce, probe, type, seqname, seqstart, seqend, transcript, exon_id)
df.intro_exon
df.inter_gene <- d.inter_gene %>% separate_wider_delim(cols = query,
delim = "_",
names = c("uce","probe")) %>%
separate_wider_delim(cols = attribute,
delim = ";",
names = c("ID","biotype","geneID","version")) %>%
separate_wider_delim(cols = biotype,
delim = "=",
names = c("gff_attribute1","biotype")) %>%
separate_wider_delim(cols = ID,
delim = ":",
names = c("gff_attribute2","gene_id")) %>%
mutate(query=paste(uce,probe, sep="_")) %>%
select(scaffold, qstart, qend, query, uce, probe, type, seqname, seqstart, seqend, biotype, gene_id)
df.inter_gene
UCEs that map to exons
# df.intro_exon %>% distinct(uce, .keep_all = T) %>% count(type)
#
# #uce_by_exon <- df.intro_exon %>% distinct(uce, .keep_all = T) %>% group_by(exon_id) %>% summarize(uce_count = sum(uce >= 1))
#
# uce_by_exon2 <- uce_by_exon %>% filter(uce_count > 1) %>% na.omit()
#
# psych::describe(uce_by_exon2)
# df.inter_gene %>% distinct(uce, .keep_all = T) %>% count(type)
#
# uce_count_by_gene <- df.inter_gene %>% distinct(uce, .keep_all = T) %>% group_by(gene_id) %>% summarize(uce_count = sum(uce >= 1))
#
# uce_count_by_gene2 <- uce_count_by_gene %>% filter(uce_count > 1) %>% na.omit() # na.omit gets rid of the intergenic UCEs (all 769 of them)
#
# psych::describe(na.omit(uce_count_by_gene$uce_count)) # includes the 769 loci coming from the intergenic regions
# psych::describe(na.omit(uce_count_by_gene2$uce_count))
we need to make one more category that best characterizes as genetic, intergenic, or both
scaffold qstart qend query uce probe type seqname seqstart seqend biotype gene_id
<chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
1 1 1011217 1011282 uce-71245_p7 uce-71245 p7 gene 1 982581 1011637 protein_coding ENSMPTG00005003053
2 1 1011217 1011329 uce-71245_p8 uce-71245 p8 gene 1 982581 1011637 protein_coding ENSMPTG00005003053
3 1 1018428 1018429 uce-267689_p7 uce-267689 p7 intergenic 1 1011637 1018429 NA NA
4 1 1018429 1018545 uce-267689_p7 uce-267689 p7 gene 1 1018430 1035525 protein_coding ENSMPTG00005001153
see second UCE here, it is both intergenic and genetic… these are the only variables we are worried about right now.
category.inter_gene <- df.inter_gene %>%
group_by(uce) %>%
mutate(category = if (n_distinct(type) > 1) 'intergenic_genetic' else unique(type)) %>%
ungroup() %>%
select(scaffold, uce, probe, type, category)
category.inter_gene.all <- category.inter_gene %>% distinct(uce, .keep_all = T) %>% select(category) %>% table() %>% as_tibble()
category.inter_gene.all
category.inter_gene.bychromosome <- category.inter_gene %>% group_by(scaffold) %>% distinct(uce, .keep_all = T) %>% select(category) %>% table()
## Adding missing grouping variables: `scaffold`
category.inter_gene.bychromosome
## category
## scaffold gene intergenic intergenic_genetic
## 1 106 38 10
## 10 115 36 7
## 11 114 47 5
## 12 154 40 8
## 13 60 12 4
## 14 57 11 5
## 15 15 2 2
## 16 28 11 3
## 17 54 19 5
## 18 9 2 2
## 2 176 93 15
## 3 139 47 10
## 4 85 35 4
## 5 117 62 13
## 6 121 32 9
## 7 154 57 6
## 8 98 20 7
## 9 127 112 14
## X 136 37 8
#create our dataframe
category.df.inter_gene.all <- data.frame(
genomic_feature= category.inter_gene.all$.,
uce_count = category.inter_gene.all$n) %>%
mutate( proportion = round(uce_count / sum(uce_count), 4))
category.df.inter_gene.all
# create a pie chart
pb.category.inter_gene.all <- category.df.inter_gene.all %>%
ggplot(aes(x="", y=proportion, fill=reorder(genomic_feature,proportion))) +
geom_bar(width = 1, stat = "identity") +
coord_polar("y", start = 0) +
scale_fill_grey() +
theme(axis.title.y = element_blank(),
axis.title.x = element_blank(),
panel.border = element_blank(),
panel.grid = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_blank(),
panel.background = element_blank(),
plot.title = element_text(hjust=0.5)) +
geom_text(aes(label = percent(proportion, accuracy = 0.01), fontface=2),
position = position_stack(vjust=0.5)) +
ggtitle("UCE characterized as genetic or intergenic") +
labs(fill="Genomic feature") +
guides(fill=guide_legend(reverse=T))
pb.category.inter_gene.all
# df.intro_exon %>% distinct(uce, .keep_all = T) %>% count(type)
#
# #uce_by_exon <- df.intro_exon %>% distinct(uce, .keep_all = T) %>% group_by(exon_id) %>% summarize(uce_count = sum(uce >= 1))
#
# uce_by_exon2 <- uce_by_exon %>% filter(uce_count > 1) %>% na.omit()
#
# psych::describe(uce_by_exon2)
category.intro_exon <- df.intro_exon %>%
group_by(uce) %>%
mutate(category = if (n_distinct(type) > 1) 'intron_exon' else unique(type)) %>%
ungroup() %>%
select(scaffold, uce, probe, type, category)
category.intro_exon.all <- category.intro_exon %>% distinct(uce, .keep_all = T) %>% select(category) %>% table() %>% as_tibble()
category.intro_exon.all
category.intro_exon.bychromosome <- category.intro_exon %>% group_by(scaffold) %>% distinct(uce, .keep_all = T) %>% select(category) %>% table()
## Adding missing grouping variables: `scaffold`
category.intro_exon.bychromosome
## category
## scaffold exon intron intron_exon
## 1 66 5 45
## 10 69 4 47
## 11 64 5 52
## 12 106 7 50
## 13 36 2 26
## 14 34 2 27
## 15 9 0 8
## 16 22 1 8
## 17 34 4 21
## 18 9 0 2
## 2 113 14 66
## 3 86 7 58
## 4 46 4 40
## 5 78 9 46
## 6 68 6 56
## 7 110 4 47
## 8 58 3 46
## 9 73 13 57
## X 60 18 67
#create our dataframe
category.df.intro_exon.all <- data.frame(
genomic_feature= category.intro_exon.all$.,
uce_count = category.intro_exon.all$n) %>%
mutate( proportion = round(uce_count / sum(uce_count), 4)) %>%
arrange(desc(proportion))
category.df.intro_exon.all
# create a pie chart
pb.category.intro_exon.all <- category.df.intro_exon.all %>%
ggplot(aes(x="", y=proportion, fill=reorder(genomic_feature,uce_count))) +
geom_bar(width = 1, stat = "identity") +
coord_polar("y", start = 0) +
scale_fill_grey() +
theme(axis.title.y = element_blank(),
axis.title.x = element_blank(),
panel.border = element_blank(),
panel.grid = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_blank(),
panel.background = element_blank(),
plot.title = element_text(hjust=0.5)) +
geom_text(aes(label = percent(proportion, accuracy = 0.01), fontface=2),
position = position_stack(vjust=0.5)) +
ggtitle("Genetic UCEs characterized as intron or exon") +
labs(fill="Gene feature") +
guides(fill=guide_legend(reverse=T))
pb.category.intro_exon.all
next we need to characterize the number of UCEs per gene. Here is an example from the df.inter_gene with three different UCEs mapping to the same gene
scaffold qstart qend query uce probe type seqname seqstart seqend biotype gene_id
X 35120719 35120839 uce-123899_p11 uce-123899 p11 gene X 35120216 35135278 protein_coding ENSMPTG00005012093
X 35120759 35120879 uce-123899_p12 uce-123899 p12 gene X 35120216 35135278 protein_coding ENSMPTG00005012093
X 35129993 35130087 uce-17802_p12 uce-17802 p12 gene X 35120216 35135278 protein_coding ENSMPTG00005012093
X 35130007 35130127 uce-17802_p11 uce-17802 p11 gene X 35120216 35135278 protein_coding ENSMPTG00005012093
X 35131551 35131671 uce-123823_p12 uce-123823 p12 gene X 35120216 35135278 protein_coding ENSMPTG00005012093
X 35131591 35131711 uce-123823_p11 uce-123823 p11 gene X 35120216 35135278 protein_coding ENSMPTG00005012093
category.gene_uce <- df.inter_gene %>%
group_by(gene_id) %>%
mutate(gene_id =
if (type == 'intergenic') 'intergenic' else gene_id) %>% # ignoring warning for now
mutate(uce_count =
case_when(
n_distinct(uce) == 1 ~ 'n=1',
n_distinct(uce) == 2 ~ 'n=2',
n_distinct(uce) == 3 ~ 'n=3',
n_distinct(uce) == 4 ~ 'n=4',
n_distinct(uce) == 5 ~ 'n=5')) %>%
ungroup() %>%
select(scaffold, uce, probe, type, gene_id, uce_count)
category.gene_uce
category.gene_uce.all <- category.gene_uce %>% distinct(gene_id, .keep_all = T) %>% select(uce_count) %>% table() %>% as_tibble()
category.gene_uce.all
category.gene_uce.bychromosome <- category.gene_uce %>% group_by(scaffold) %>% distinct(gene_id, .keep_all = T) %>% select(uce_count) %>% table()
## Adding missing grouping variables: `scaffold`
category.gene_uce.bychromosome
## uce_count
## scaffold n=1 n=2 n=3 n=4 n=5
## 1 84 10 4 0 0
## 10 96 10 3 0 0
## 11 85 9 4 0 1
## 12 111 26 1 0 0
## 13 59 4 0 0 0
## 14 48 5 2 0 0
## 15 16 2 0 0 0
## 16 27 2 0 0 0
## 17 44 5 2 0 0
## 18 9 1 0 0 0
## 2 125 26 5 1 0
## 3 110 17 2 1 0
## 4 55 12 4 0 0
## 5 80 18 3 1 1
## 6 89 12 5 1 0
## 7 109 15 5 2 0
## 8 70 17 1 0 1
## 9 101 17 3 0 0
## X 102 11 4 2 0
Next we create a data frame and plot our results, only plot N > 1 UCEs per gene
# create our data frame from our object class table
category.df.gene_uce.all <- data.frame(
uce_per_gene = category.gene_uce.all$.,
genes_total = category.gene_uce.all$n) %>%
filter(uce_per_gene!='n=1')
category.df.gene_uce.all
# create barplot
bp.gene_uce.all <- category.df.gene_uce.all %>%
ggplot(aes(x=uce_per_gene,y=genes_total)) +
geom_bar(stat="identity") +
annotate("text",x=3.5, y=150,
label=
"genetic UCEs: 1863\nintergenic UCEs: 710\nintergenic & genetic UCEs: 131 *\n
genes with >=1 UCEs: 1698\nexonic UCEs: 1141\nintronic & exonic UCEs 766 *\nintronic UCEs: 108") +
geom_text( aes(label=genes_total), vjust=-1) +
scale_y_continuous(limits=c(0,225)) +
labs(x="UCE per gene",
y="total genes",
title= expression("Adephaga 2.9k UCEs within *Pterostichus madidus* genes")) +
theme_bw() +
theme(plot.title = ggtext::element_markdown())
bp.gene_uce.all
Something is off in the counting, things arent adding up. I’ve identified 1698 genes, with one or more UCE (e.g., 219 with 2, 48 with 3, etc.) BUT the cumulative of all identfied exonic, intronic & exonic, and intronic UCEs is greater.
The total number of mapped UCEs is 2704, which tracks with the genetic/intergenic calculations. However, something is being counted wrong (or my interpretation of the counting is way off). 1698 genes with 1 or more UCEs mapped to them…. but when I count specific gene features (gff, intronict, or both) the sum is much greater. For example, 219 genes have two (438 UCEs), 48 genes have three (144), eight genes have four (32), and three have five (15); for a total of 629 multi-genic UCEs. There are 1420 genes with one UCE, plus the other genes = 1698 genes. cool cool cool. there are 629 + 1420 = 2049 genic UCEs. But this summatoin does not equal the 1863 genetic UCEs identified previously calculated (even including 131 both intergenic and genetic uces: 1994)
these calculations are all so off some how.
# uces_in_genes <- uce_count_by_gene2 %>% ggplot(aes(x=uce_count)) +
# geom_bar(stat="count") +
# annotate("text",
# x=4.5, y=150,
# label="intergenic UCEs: 770\ngenes with UCEs: 1934\n>1 UCE per gene: 257\nUCEs in exons: 1420\nUCEs in introns: 595") +
# geom_text(stat= 'count',
# aes(label=..count..),
# vjust=-1) +
# scale_y_continuous(limits=c(0,225)) +
# # theme_update(plot.title = element_text(hjust = 0.5)) +
# labs(x="UCE per gene",
# y="total genes",
# title= expression("Adephaga 2.9k UCEs present in *Pterostichus madidus* genome")) +
# theme_bw() +
# theme(plot.title = ggtext::element_markdown())
# uces_in_genes
Trying to solve the counting issue. genes w/ UCE
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggtext_0.1.2 scales_1.1.1 ggplot2_3.3.5 readr_2.1.3 dplyr_1.0.10
## [6] tidyr_1.3.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.0 xfun_0.36 bslib_0.4.2 purrr_1.0.1
## [5] colorspace_2.0-2 vctrs_0.5.2 generics_0.1.3 htmltools_0.5.4
## [9] yaml_2.3.7 utf8_1.2.2 rlang_1.0.6 gridtext_0.1.5
## [13] jquerylib_0.1.4 pillar_1.8.1 glue_1.6.2 withr_2.5.0
## [17] DBI_1.1.3 bit64_4.0.5 lifecycle_1.0.3 stringr_1.5.0
## [21] munsell_0.5.0 gtable_0.3.0 evaluate_0.15 labeling_0.4.2
## [25] knitr_1.37 tzdb_0.3.0 fastmap_1.1.0 parallel_4.1.2
## [29] markdown_1.1 fansi_1.0.4 highr_0.9 Rcpp_1.0.10
## [33] cachem_1.0.6 vroom_1.6.1 jsonlite_1.8.4 farver_2.1.0
## [37] bit_4.0.5 hms_1.1.2 digest_0.6.31 stringi_1.7.12
## [41] grid_4.1.2 cli_3.6.0 tools_4.1.2 magrittr_2.0.3
## [45] sass_0.4.0 tibble_3.1.8 crayon_1.5.2 pkgconfig_2.0.3
## [49] ellipsis_0.3.2 xml2_1.3.3 assertthat_0.2.1 rmarkdown_2.20
## [53] rstudioapi_0.13 R6_2.5.1 compiler_4.1.2